Introduction

This document summarizes the microbiota analyses for the FEMAP+ACV study. There were 12 patients that participated in the study, but only 10 were complete with both “pre” and “post” samples (prior to and following 3 months of acetate supplementation, respectively). The two that were incomplete and not included in downstream analyses (at this time) were patients 003 and 004.

Initial Code

1. Load required packages

library(ALDEx2)
library(ARTool)
library(broom)
library(dplyr)
library(forcats)
library(ggplot2)
library(ggrepel)
library(gridExtra)
library(knitr)
library(Maaslin2)
library(microbiome)
library(patchwork)
library(phyloseq)
library(plyr)
library(RColorBrewer)
library(reshape2)
library(rlist)
library(sjmisc)
library(stringr)
library(tidyr)
library(zCompositions)

2. Import the files

counts <- read.table("data/cutadapt_counts2.txt", 
                     header = TRUE, row.names = 1, sep = "\t", check.names = FALSE, 
                     quote = "", stringsAsFactors = FALSE)
tax <- read.table("data/cutadapt_tax2.txt", 
                  header = TRUE, row.names = 1, sep = "\t", check.names = FALSE, 
                  quote = "", stringsAsFactors = FALSE)
meta<-read.table("data/metadata.txt", header=T, row.names = 1, sep='\t', comment.char = "")

3. Calculate alpha diversity on unfiltered counts table

# make a phyloseq object
tax<-as.matrix(tax) 
OTU = otu_table(counts, taxa_are_rows = FALSE)
TAX = tax_table(tax)
physeq = phyloseq(OTU, TAX)

From phyloseq R package, calculate various alpha diversity measures

div<- estimate_richness(physeq, split = TRUE, measures = NULL)

From microbiome R package, calculate various dominance indices

dom<- dominance(physeq)

Merge the data

div_all <- data.frame(div, dom)
rownames(div_all)<-gsub("X","", rownames(div_all))

div_merge<-merge(div_all, meta, by=0, all=FALSE)
div_merge2<-subset(div_merge, !is.na(timepoint))
div_merge2$participant<-as.factor(div_merge2$participant)
div_merge2<-div_merge2[!((div_merge2$Row.names) %in% c("003_post", "004_pre")),]

#write the alpha diversity measures into a new file
#write.table(div_merge2, file="data/alpha_diversity.txt", sep='\t', quote=F)

Now lets do some stats to compare pairwise between pre and post samples

# pairwise t-tests
# Select columns to test
cols_to_test <- c(2,3,5, 7:14, 16:17)
columns_to_test <- names(div_merge2)[cols_to_test]

tp_pre  <- grep("pre$",  unique(div_merge2$timepoint),  value = TRUE)
tp_post <- grep("post$", unique(div_merge2$timepoint), value = TRUE)

results <- data.frame()

for (col in columns_to_test) {
  pre_df  <- div_merge2[div_merge2$timepoint == tp_pre,  c("participant", col)]
  post_df <- div_merge2[div_merge2$timepoint == tp_post, c("participant", col)]
  merged <- merge(pre_df, post_df, by = "participant", suffixes = c("_pre", "_post"))
  pre_vals  <- merged[[paste0(col, "_pre")]]
  post_vals <- merged[[paste0(col, "_post")]]

  # paired Wilcoxon signed-rank test
  test_result <- wilcox.test(pre_vals, post_vals, paired = TRUE, exact = FALSE)

  # save results
  results <- rbind(
    results,
    data.frame(
      Variable = col,
      W_statistic = test_result$statistic,
      p_value = test_result$p.value
    )
  )
}

kable(results)
Variable W_statistic p_value
V Observed 36 0.4148231
V1 Chao1 34 0.5408179
V2 ACE 36 0.4148231
V3 Shannon 35 0.4755327
V4 Simpson 33 0.6102987
V5 InvSimpson 32 0.6834809
V6 Fisher 37 0.3589514
V7 dbp 23 0.6834809
V8 dmn 25 0.8384638
V9 absolute 27 1.0000000
V10 relative 23 0.6834809
V11 core_abundance 21 0.5408179
V12 gini 15 0.2212718

So there are no significant trends in alpha diversity pre- vs. post-intervention.

What about if we take into account the metabolic responder status? We can do this with a 2-way ANOVA.

# perform an Aligned Rank Transform (nonparametric alternative to 2-way ANOVA)
# Create the 'participant_id' column to group by
div_merge2$participant_id <- sub("_.*", "", div_merge2$Row.names)
div_merge2$participant_id<-as.factor(div_merge2$participant_id)
div_merge2$timepoint <- as.factor(div_merge2$timepoint)
div_merge2$metabolic_responder <- as.factor(div_merge2$metabolic_responder)

art_aov_results <- data.frame()

for (col in columns_to_test) {
  f <- as.formula(paste0(col, " ~ timepoint * metabolic_responder + Error(participant_id)"))
  art_model <- art(f, data = div_merge2)
  anova_res <- anova(art_model)
  anova_df <- tidy(anova_res)
  anova_df <- anova_df %>% filter(Term == "timepoint:metabolic_responder")
  anova_df$Metric <- col
  anova_df$term <- NULL
  anova_df <- anova_df %>% relocate(Metric)
  art_aov_results <- rbind(art_aov_results, anova_df)
}
## Warning: The column names Term, Error, Df.res, and Sum.Sq.res in ANOVA output were not
## recognized or transformed.
## The column names Term, Error, Df.res, and Sum.Sq.res in ANOVA output were not
## recognized or transformed.
## The column names Term, Error, Df.res, and Sum.Sq.res in ANOVA output were not
## recognized or transformed.
## The column names Term, Error, Df.res, and Sum.Sq.res in ANOVA output were not
## recognized or transformed.
## The column names Term, Error, Df.res, and Sum.Sq.res in ANOVA output were not
## recognized or transformed.
## The column names Term, Error, Df.res, and Sum.Sq.res in ANOVA output were not
## recognized or transformed.
## The column names Term, Error, Df.res, and Sum.Sq.res in ANOVA output were not
## recognized or transformed.
## The column names Term, Error, Df.res, and Sum.Sq.res in ANOVA output were not
## recognized or transformed.
## The column names Term, Error, Df.res, and Sum.Sq.res in ANOVA output were not
## recognized or transformed.
## The column names Term, Error, Df.res, and Sum.Sq.res in ANOVA output were not
## recognized or transformed.
## The column names Term, Error, Df.res, and Sum.Sq.res in ANOVA output were not
## recognized or transformed.
## The column names Term, Error, Df.res, and Sum.Sq.res in ANOVA output were not
## recognized or transformed.
## The column names Term, Error, Df.res, and Sum.Sq.res in ANOVA output were not
## recognized or transformed.
kable(art_aov_results)
Metric Term Error df Df.res sumsq Sum.Sq.res meansq statistic p.value
Observed timepoint:metabolic_responder Within 1 8 145.8 206.65 145.8 5.6443262 0.0448401
Chao1 timepoint:metabolic_responder Within 1 8 145.8 188.40 145.8 6.1910828 0.0376283
ACE timepoint:metabolic_responder Within 1 8 145.8 199.00 145.8 5.8613065 0.0417840
Shannon timepoint:metabolic_responder Within 1 8 7.2 99.60 7.2 0.5783133 0.4687921
Simpson timepoint:metabolic_responder Within 1 8 3.2 134.80 3.2 0.1899110 0.6745105
InvSimpson timepoint:metabolic_responder Within 1 8 1.8 131.40 1.8 0.1095890 0.7491169
Fisher timepoint:metabolic_responder Within 1 8 156.8 159.00 156.8 7.8893082 0.0228856
dbp timepoint:metabolic_responder Within 1 8 0.2 227.00 0.2 0.0070485 0.9351550
dmn timepoint:metabolic_responder Within 1 8 1.8 152.00 1.8 0.0947368 0.7661004
absolute timepoint:metabolic_responder Within 1 8 7.2 195.60 7.2 0.2944785 0.6021506
relative timepoint:metabolic_responder Within 1 8 0.2 227.00 0.2 0.0070485 0.9351550
core_abundance timepoint:metabolic_responder Within 1 8 33.8 147.00 33.8 1.8394558 0.2120499
gini timepoint:metabolic_responder Within 1 8 33.8 87.00 33.8 3.1080460 0.1159221

Several metrics have P < 0.05, so there may be a responder-specific trend in alpha diversity changes to the intervention.

fisher<-ggplot(div_merge2, aes(x=timepoint, y=Fisher, group = metabolic_responder, colour = metabolic_responder)) +
  geom_point(aes(fill=metabolic_responder), size=4, shape=21, stroke=1, alpha=0.5) +
  geom_line(aes(group = participant), size = 1) +
  geom_text_repel(aes(label=participant), size=3) +
  scale_color_brewer(palette="Paired") +
  scale_fill_brewer(palette="Paired") +
  labs(y="Fisher's alpha ") +
  scale_x_discrete(labels = c("Pre", "Post")) +
  theme_bw() +
  theme(axis.title.x = element_blank(), axis.title=element_text(size=10))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
fisher

4. Filter the counts tables for downstream analyses

# what are the dimensions of the original counts file?
dim(counts)
## [1]   23 2418
# Combine the counts and tax tables into one tidier working file
t.counts<-t(counts)
ct<-merge(t.counts, tax, by="row.names")

rownames(ct)<-ct$Row.names
ct$Row.names <- NULL
ct$Sequence <- NULL

ct<-unite(ct,"tax.vector", Kingdom:Species, sep=':', remove=TRUE)

dim(ct)
## [1] 2418   24

Check to see how many samples contain > 1000 total reads.

ct2<-ct[,1:ncol(ct)-1]
i <- (colSums(ct2) <=1000)
d.s <- ct2[, !i]
dim(d.s)
## [1] 2418   23
ncol(ct2)-ncol(d.s)
## [1] 0

So all the samples have > 1000 reads. Now lets apply some frequency-based cutoffs.

d.freq <- apply(d.s, 2, function(x) {x/sum(x)})

#keep SVs > 0.1% in any sample
d.0 <- d.s[apply(d.freq, 1, max)>0.001,]
dim(d.0)
## [1] 767  23

You can save tables with these filtering cutoffs.

# make relative abundance table
d.freq0 <- data.frame(apply(d.0, 2, function(x){x/sum(x)}))

#add taxonomy back in and save the filtered counts file
d.0$tax.vector = ct$tax.vector[match(rownames(d.0), rownames(ct))]
#write.table(d.0, file="data/dada2_tax_counts_001filt.txt", sep='\t', quote=F)

#add taxonomy back in and save the filtered abundance file
d.freq0$tax.vector = ct$tax.vector[match(rownames(d.freq0), rownames(ct))]
#write.table(d.freq0, file="data/dada2_tax_freq_001filt.txt", sep='\t', quote=F)

I also want to remove very rare/sparse SVs.

#filter SVs based on a read count cutoff
count = 100
d.2 <- data.frame(d.s[which(apply(d.s, 1, function(x){sum(x)}) > count),])
dim(d.2)
## [1] 516  23

Lets combine d.freq0 and d.2 so that we have a filtered table that contains only SVs present at 0.1% abundance AND >100 reads across all samples.

row_list<-intersect(rownames(d.0), rownames(d.2))
length(row_list)
## [1] 511
d_filt<-d.s[rownames(d.s) %in% row_list,]

#add taxonomy back in and save the filtered counts file
d_filt$tax.vector = ct$tax.vector[match(rownames(d_filt), rownames(ct))]
#write the file
#write.table(d_filt, file="data/counts_01abun_100filt_2.txt", sep='\t', quote=F)

d_filt_freq<-d.s[rownames(d.s) %in% row_list,]
d_filt_freq<-data.frame(apply(d_filt_freq, 2, function(x){x/sum(x)}))
d_filt_freq$tax.vector = ct$tax.vector[match(rownames(d_filt_freq), rownames(ct))]
#write.table(d_filt_freq, file="data/abundance_01abun_100filt.txt", sep='\t', quote=F)

These files will be the ones we primarily use downstream.

Figures

Figure 2. Alpha and beta diversity of the gut microbiota pre- and post-intervention.

2A. Relative abundance barplot

# Read input data
#d <- read.table("data/counts_01abun_100filt.txt", sep="\t", quote="", check.names=F, header=T, row.names=1, comment.char="")
#OR, rename the table made in the previous section to 'd'
d<-d_filt

# Extract and clean the taxonomic information
taxa <- data.frame(str_split_fixed(d$tax.vector, ":", 7))
taxa$Genus <- paste(taxa$X2, taxa$X3, taxa$X4, taxa$X5, taxa$X6, sep="_")

# Merge cleaned taxa info with count data
d1 <- cbind(d[,1:(ncol(d)-1)], Genus=taxa$Genus)

# Aggregate to the genus level
gen <- ddply(d1, "Genus", numcolwise(sum))
row.names(gen) <- gen$Genus
gen$Genus <- NULL

# Calculate the relative abundance
gen.f <- apply(gen, 2, function(x) x / sum(x))

# Order by abundance
y1 <- gen.f[order(rowSums(gen.f), decreasing = TRUE),]

We dont want to plot all the very rare taxa (below 1% abundance). We will put those in a separate group called the remainder.

# Filter taxa above 1% abundance
abund <- 0.01
keep.taxa.index <- rownames(y1[rowMeans(y1) > abund,])

# Retain only the relevant taxa and calculate the remainder
y3 <- as.data.frame(y1) %>% filter(rownames(.) %in% keep.taxa.index)
remainder <- colSums(y1[!rownames(y1) %in% keep.taxa.index, , drop = FALSE])
y3 <- rbind(y3, remainder)
rownames(y3)[nrow(y3)] <- "remainder"

# Remove the controls
y3<-as.data.frame(y3)
y3 <- dplyr::select(y3, -"DNA_BLANK2", -"PCR_BLANK2", -"POS")

# Convert to matrix and reshape for plotting
y3 <- as.matrix(y3)
melted <- melt(y3)
colnames(melted) <- c("Genus", "Sample", "value")

# Shorten genus names
gen_name <- data.frame(str_split_fixed(melted$Genus, "_", 5))
melted$Genus <- ifelse(gen_name$X5 == "", "Genera <1% abundance", gen_name$X5)

# Custom sample ordering
custom_order <- function(samples) {
  df <- data.frame(
    Sample = samples,
    Participant = gsub("_(pre|post)", "", samples),
    Visit = ifelse(grepl("_pre", samples), "A", "B")
  ) %>% arrange(Participant, Visit)
  return(df$Sample)
}
melted$Sample <- factor(melted$Sample, levels = custom_order(unique(melted$Sample)))

Make the plot

# Define color palettes
pal1 <- rep(c("lightgray", "#006F6B", "#00ADAB", "#ACDEE0", "#3E783A", "#76BB47", "#AAD486",
              "#CBC02D", "#FCF281", "#C36928", "#F58123", "#F8A96E", "#F9DDCA", "#851719",
              "#B01F24", "#ED2027", "#F3786D", "#4A2970", "#7E6BAD", "#B8ACD1", "#A71D47",
              "#ED1F6B"))
pal2 <- rlist::list.reverse(pal1)

# Plot with faceting
melted$Sample <- gsub("X", "", melted$Sample)
melted$Participant <- sub("_.*", "", melted$Sample)
melted$Timepoint <- sub(".*_", "", melted$Sample)
melted$Timepoint <- gsub("pre", "Pre", melted$Timepoint)
melted$Timepoint <- gsub("post", "Post", melted$Timepoint)
melted$Timepoint <- factor(melted$Timepoint, levels = c("Pre", "Post"))

bars <- ggplot(melted, aes(x=Timepoint, y=value, fill=fct_reorder(Genus, value))) +
  geom_bar(stat="identity", position="stack") + 
  scale_fill_manual(values=pal2) +
  facet_grid(~Participant, scales="free_x") +
  ylab("Relative Abundance") +
  guides(fill = guide_legend(ncol=2, reverse=T, title="Genus")) +
  theme_bw() +
  theme(axis.text.x=element_text(size=8), legend.position="right", axis.title.x=element_blank(), legend.text=element_text(size=8, face="italic"), legend.title=element_text(size=10), axis.title=element_text(size=10))
bars

2B. PCA plot

tax1 <- d$tax.vector

# Define the columns to remove (controls)
cols_to_remove <- c("DNA_BLANK2", "PCR_BLANK2", "POS")

# Remove the columns if they exist in the data frame
d <- d[, !(colnames(d) %in% cols_to_remove)]

# Split to the 6th taxonomic level -> genus (separated by :)
split6 <- sapply(strsplit(as.character(tax1), ":"), "[", 6)
split6 <- as.data.frame(split6)
rownames(split6)<-rownames(d)
split6$sv<-rownames(d)
split6$sv_split6<-paste(split6$sv, split6$split6, sep='_')
rownames(split6)<-rownames(d)

#get only the count columns
dm <- d[,1:ncol(d)-1]

gen.f <- apply(dm, 2, function(x) {x/sum(x)})
colSums(gen.f) #check all sum to 1
## 001_post  001_pre 002_post  002_pre 005_post  005_pre 006_post  006_pre 
##        1        1        1        1        1        1        1        1 
## 007_post  007_pre 008_post  008_pre 009_post  009_pre 010_post  010_pre 
##        1        1        1        1        1        1        1        1 
## 011_post  011_pre 012_post  012_pre 
##        1        1        1        1

Apply compositional data transformation (CZM method) and CLR transformation.

d.czm <- cmultRepl(t(gen.f),  label=0, method="CZM")
## No. adjusted imputations:  2950
d.clr <- t(apply(d.czm, 1, function(x){log(x) - mean(log(x))}))

Compute Aitchison distances, which will be used in Fig 2C and D.

aitch_dists <- as.matrix(dist(d.clr))
#write.table(aitch_dists, file="data/aitchdist_filtered.txt", sep="\t", quote=F)

Perform PCA

d.pcx <- prcomp(d.clr)

# Define parameters for PCA
sv_positions <- data.frame(d.pcx[["rotation"]])
# Merge on genus table
sv_pos <- merge(sv_positions, split6, by = 0)
# Calculate Euclidean distance
arrow_len <- function(x, y) {
  sqrt((x - 0)^2 + (y - 0)^2)
}
sv_pos_dist <- mapply(arrow_len, sv_pos$PC1, sv_pos$PC2)
sv_pos_dist <- as.data.frame(cbind(sv_pos_dist, sv_pos$Row.names, sv_pos$sv_split6, sv_pos$PC1, sv_pos$PC2))
colnames(sv_pos_dist) <- c("Distance", "SV", "Genus", "PC1", "PC2")
# Filter arrow based on distance
filter <- sv_pos_dist %>% filter(Distance >= 0.15)

d.mvar <- sum(d.pcx$sdev^2)
# Calculate the PC1 and PC2 variance
PC1 <- paste("PC1: ", round(sum(d.pcx$sdev[1]^2)/d.mvar, 3))
PC2 <- paste("PC2: ", round(sum(d.pcx$sdev[2]^2)/d.mvar, 3))

metadata<-tibble::rownames_to_column(meta, "SampleID")

loadings<- data.frame(Variables=rownames(d.pcx$rotation), d.pcx$rotation)
values<-merge(d.pcx$x[,c(1,2)], metadata[,c("SampleID","participant","timepoint","metabolic_responder")],
              by.x="row.names", by.y="SampleID", all=F)

# Remove unwanted rows
values2 <- values[!(values$Row.names %in% c("003_post", "004_pre","DNA_BLANK2","PCR_BLANK2")), ]
values2$time<-gsub("._","",values2$timepoint)

# Convert participant to a factor so it's treated as discrete
values2$participant <- as.factor(values2$participant)

Make the plot!

p <- ggplot(values2, aes(x = PC1, y = PC2)) +
  geom_segment(data = sv_pos, aes(x = 0, y = 0, xend = 50 * PC1, yend = 50 * PC2),
               arrow = arrow(length = unit(1/2, 'picas')),
               color = "grey69", alpha = 0.8, size = 0.15) + # Plot features
  geom_text(data = filter, aes(x = 50 * as.numeric(PC1), y = 50 * as.numeric(PC2)), 
            nudge_x = 0.25, nudge_y = 0.25, label = filter$Genus, 
            color = "grey69", size = 3, fontface = "italic") +
  geom_point(aes(colour = participant), size = 6) +  # Points colored by participant (Viridis)
  geom_text(aes(label = time), size = 2.5) +
  scale_color_brewer(palette="Paired") +
  xlab(paste0("PC1: ", round(100 * (d.pcx$sdev[1]^2 / sum(d.pcx$sdev^2)), 1), "%")) +
  ylab(paste0("PC2: ", round(100 * (d.pcx$sdev[2]^2 / sum(d.pcx$sdev^2)), 1), "%")) +
  labs(colour="Participant") + 
  theme_bw() +
  theme(legend.position = c(0.1, 0.67), legend.background = element_blank(), legend.key = element_blank(), axis.title=element_text(size=10), legend.title = element_text(size=10))
p

2C. Bar plot of pre vs post aitchison distance

#dm<-read.table("data/aitchdist_filtered.txt", sep='\t', header=TRUE, row.names=1, quote="")
#OR
dm<-aitch_dists

#remove the X from the colnames, if they're there
#colnames(dm)<-gsub("X","", colnames(dm))

intra_values <- list()
# Loop through the dm matrix to capture the filtered values
for(i in 1:nrow(dm)) {
  for(j in 1:ncol(dm)) {
    currentrow <- unlist(strsplit(rownames(dm)[i], "_"))
    currentcol <- unlist(strsplit(colnames(dm)[j], "_"))
    
    # Apply the condition: same identifier (currentrow[1] == currentcol[1]), pre in row, not equal in the second part
    if(currentrow[1] == currentcol[1] 
       && currentrow[2] != currentcol[2] 
       && currentrow[2] == "pre") {
      # Store the value into the list
      intra_values[[currentrow[1]]] <- c(intra_values[[currentrow[1]]], dm[i, j])
    }
  }
}
# Convert the list into a data frame and set appropriate column names
intra_df <- as.data.frame(intra_values)
time_test<-t(intra_df)

colnames(time_test)<-"aitch"
rownames(time_test)<-gsub("X","", rownames(time_test))
rownames(time_test)<-paste0(rownames(time_test), "_post")

dist_merge<-merge(meta, time_test, by=0, all=FALSE)
dist_merge$participant<-as.factor(dist_merge$participant)

aitch_plot <- ggplot(dist_merge, aes(x = participant, y = aitch, fill = factor(participant))) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_brewer(palette = "Paired") +
  labs(y = "Pre vs. Post Aitchison Distance", x = "Participant") +
  theme_bw() +
  theme(legend.position = "none", axis.title = element_text(size = 10), axis.ticks.x = element_blank(),
        axis.text.x = element_blank()) +  # Remove x-axis labels
  geom_text(aes(x = participant, y = 2, label = participant), 
            color = "white", size = 3, inherit.aes = FALSE)  # Adjust size as needed

aitch_plot

2D. Box and violin plot of intra- and interindividual aitchison distance comparisons

interindiv_pre_values <- list()
# Loop through the dm matrix to capture the filtered values
for(i in 1:nrow(dm)) {
  for(j in 1:ncol(dm)) {
    currentrow <- unlist(strsplit(rownames(dm)[i], "_"))
    currentcol <- unlist(strsplit(colnames(dm)[j], "_"))
    
    # Apply the condition: same identifier (currentrow[1] == currentcol[1]), pre in row, not equal in the second part
    if(currentrow[1] != currentcol[1] 
       && currentrow[2] == currentcol[2] 
       && currentrow[2] == "pre") {
      # Store the value into the list
      interindiv_pre_values[[currentrow[1]]] <- c(interindiv_pre_values[[currentrow[1]]], dm[i, j])
    }
  }
}

interindiv_post_values <- list()
# Loop through the dm matrix to capture the filtered values
for(i in 1:nrow(dm)) {
  for(j in 1:ncol(dm)) {
    currentrow <- unlist(strsplit(rownames(dm)[i], "_"))
    currentcol <- unlist(strsplit(colnames(dm)[j], "_"))
    
    # Apply the condition: same identifier (currentrow[1] == currentcol[1]), post in row, not equal in the second part
    if(currentrow[1] != currentcol[1] 
       && currentrow[2] == currentcol[2] 
       && currentrow[2] == "post") {
      # Store the value into the list
      interindiv_post_values[[currentrow[1]]] <- c(interindiv_post_values[[currentrow[1]]], dm[i, j])
    }
  }
}

# Convert lists to data frames
intra_df <- data.frame(Distance = unlist(intra_values), Category = "Intra")
inter_pre_df <- data.frame(Distance = unlist(interindiv_pre_values), Category = "Inter-Pre")
inter_post_df <- data.frame(Distance = unlist(interindiv_post_values), Category = "Inter-Post")

distance_data <- bind_rows(intra_df, inter_pre_df, inter_post_df) # Combine data frames

distance_data$Category <- factor(distance_data$Category, levels = c("Intra", "Inter-Pre", "Inter-Post"))

dist_comp<-ggplot(distance_data, aes(x = Category, y = Distance)) +
  geom_violin(trim = FALSE, fill="lightgrey", alpha = 0.5) + 
  geom_boxplot(width = 0.2, outlier.shape = NA, color = "black") +  # Add boxplot inside violin
  theme_bw() +
  #scale_fill_manual(values = c("Intra" = "#1f77b4", "Inter-Pre" = "#ff7f0e", "Inter-Post" = "#2ca02c")) +  # Custom colors
  labs(y = "Aitchison Distance") +
  theme(legend.position = "none", text = element_text(size = 10), axis.title.x = element_blank())

dist_comp

2E and F, alpha diversity comparisons

#div<-read.table("data/alpha_diversity.txt", sep="\t", quote="", check.names=F, header=T, row.names=1, comment.char="")
#OR
div<-div_merge2
div$participant<-as.factor(div$participant)
div<-div[!((div$Row.names) %in% c("003_post", "004_pre")),]

shan<-ggplot(div, aes(x=timepoint, y=Shannon, group = participant, colour = participant)) +
  geom_point(aes(fill=participant), size=4, shape=21, stroke=1, alpha=0.5) +
  geom_line(aes(group = participant), size = 1) +
  scale_color_brewer(palette="Paired") +
  scale_fill_brewer(palette="Paired") +
  labs(y="Shannon's Index") +
  scale_x_discrete(labels = c("Pre", "Post")) +
  theme_bw() +
  theme(axis.title.x = element_blank(), legend.position ="none", axis.title=element_text(size=10))
shan

core<-ggplot(div, aes(x=timepoint, y=core_abundance, group = participant, colour = participant)) +
  geom_point(aes(fill=participant), size=4, shape=21, stroke=1, alpha=0.5) +
  geom_line(aes(group = participant), size = 1) +
  scale_color_brewer(palette="Paired") +
  scale_fill_brewer(palette="Paired") +
  labs(y="Core Abundance") +
  scale_x_discrete(labels = c("Pre", "Post")) +
  theme_bw() +
  theme(axis.title.x = element_blank(), legend.position="none", axis.title=element_text(size=10))
core

Build the figure panel

lay<-rbind(c(1,1,1,1),c(1,1,1,1),c(2,2,3,4),c(2,2,5,6))
grid.arrange(bars, p, aitch_plot, dist_comp, shan, core, layout_matrix=lay)

Figure 3. Differential abundance comparisons.

3A. Differential abundance (taxonomy) heatmap

Specify the comparison groups of samples:

pre<-c("001_pre","002_pre","005_pre","006_pre","007_pre","008_pre","009_pre","010_pre","011_pre","012_pre")
post<-c("001_post","002_post","005_post","006_post","007_post","008_post","009_post","010_post","011_post","012_post")

Perform an aldex2 t-test and effect size test of paired pre- vs post-intervention samples.

aldex.in<-d[,c(pre, post)]
conds<-c(rep("pre", length(pre)), rep("post", length(post)))
x <- aldex.clr(aldex.in, conds, mc.samples=128, verbose=TRUE)
x.tt <- aldex.ttest(x, paired.test=TRUE)
x.effect <- aldex.effect(x)

#merge the data
x.all <- data.frame(x.tt, x.effect)
tax<-as.data.frame(tax)
#merge taxonomic information
x.all.tax.sv <- merge(x.all, dplyr::select(tax, 2:7), by = "row.names", all.y = FALSE)
#write a .txt file with the results
#write.table(x.all.tax, file="aldex_output/pre_vspost_SV.txt", sep="\t", quote=F, col.names=NA)

Make the heatmap

#aldex_out<-read.table("aldex_output/pre_vspost_SV.txt", sep="\t", quote="", header=T, row.names=1)
#OR
aldex_out<-x.all.tax.sv
filt_effect <- aldex_out[abs(aldex_out$effect) > 0.5, ]

# relative abundance
abun <- apply(aldex.in, 2, function(x) {x/sum(x)})
colSums(abun) #check all sum to 1
##  001_pre  002_pre  005_pre  006_pre  007_pre  008_pre  009_pre  010_pre 
##        1        1        1        1        1        1        1        1 
##  011_pre  012_pre 001_post 002_post 005_post 006_post 007_post 008_post 
##        1        1        1        1        1        1        1        1 
## 009_post 010_post 011_post 012_post 
##        1        1        1        1
#transpose to samples as rows
abun.t<-t(abun)
d.czm <- cmultRepl(abun.t,  label=0, method="CZM", z.warning = 0.95)
## No. adjusted imputations:  6601
# The table needs to be transposed again (samples as COLUMNS)
d.clr <- t(apply(d.czm, 1, function(x){log(x) - mean(log(x))}))

#only get the differentially abundant SV
d.clr.diff<-d.clr[,colnames(d.clr) %in% filt_effect$Row.names]

clr_melt<-melt(d.clr.diff)
colnames(clr_melt)<-c("sample","SV","clr")

#add a participant and timepoint column
clr_melt$participant<-gsub("_.*","", clr_melt$sample)
clr_melt$timepoint<-gsub(".*_","",clr_melt$sample)
clr_melt$timepoint<-gsub("pre","1pre",clr_melt$timepoint)

filt_effect <- filt_effect %>%
  mutate(bugnames = ifelse(!is.na(Species), paste(Row.names, Genus, Species, sep = "_"), paste(Row.names, Genus, sep = "_")))

filt_effect<- filt_effect %>% arrange(effect)

clr_final <- clr_melt %>%
  left_join(filt_effect %>% dplyr::select(Row.names, bugnames), by = c("SV" = "Row.names"))

clr_final <- clr_final %>%
  left_join(filt_effect %>% dplyr::select(Row.names, effect), by =c("SV"="Row.names"))

clr_final$effect<-clr_final$effect * -1
clr_final <- clr_final %>%
  arrange(effect) %>%
  mutate(bugnames = factor(bugnames, levels = unique(bugnames)))  # Preserve order in factor

filt_effect$effect<-filt_effect$effect *-1

filt_effect<-filt_effect %>% mutate(bugnames = factor(bugnames, levels = unique(bugnames)))

Make the plot

heat <- ggplot(clr_final, aes(x = timepoint, y = bugnames, fill = clr)) +
  geom_tile() +
  facet_grid(~participant, scales = "free_x") +
  scale_fill_gradientn(
    colors = rev(c("#98033A","#D43747","#F76340","#FEA55C","#FEFFBA","white","#9CDA9C","#267AB1","#5a78b5","#554394")),
    values = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1), # Keeping the same values
    limits = range(clr_final$clr)) +
  scale_x_discrete(labels=c("1pre"="Pre", "post"="Post")) +
  theme_bw() +
  theme(axis.title = element_blank(), 
        legend.position = c(-0.2, 1.05), 
        legend.direction = "horizontal",
        axis.text.y=element_text(face="italic"),
        axis.text.x = element_text(size=8))
heat

This heatmap will look slightly different every time a new aldex ttest and effect size test is run, because the results are based on estimates.

bars <- ggplot(filt_effect, aes(x = effect, y = bugnames, fill = ifelse(effect > 0.5, "salmon", "#5a78b5"))) +
  geom_col() +
  scale_fill_identity() +  # Directly apply colors without expecting discrete categories
  geom_text(aes(label = round(effect, 2), x = ifelse(effect > 0.5, 0.1, -0.1)),  # Position text dynamically
            color = "black", hjust = ifelse(filt_effect$effect > 0.5, 0, 1), size = 3) +  # Adjust text alignment
  scale_y_discrete(limits = rev(unique(filt_effect$bugnames))) +  # Keep order consistent with heatmap
  theme_minimal() +
  xlab("Effect Size") +
  theme(
    axis.text.y = element_blank(),  # Hide y-axis text to avoid duplication
    axis.title.y = element_blank(),
    axis.text.x = element_blank(),
    panel.grid = element_blank())

bars

final_plot<- heat + bars + plot_layout(widths = c(4, 1))
final_plot

3B. Differential abundance of EC numbers

Perform an aldex2 t-test and effect size test of paired pre- vs post-intervention samples on microbial functional output tables.

First up, EC numbers.

ec<-read.table("data/pred_metagenome_unstrat_EC.tsv", sep="\t", quote="", check.names=F, header=T, row.names=1, comment.char="")
ec<-round(ec, digits=0)

# pre vs post all samples
aldex.in.ec<-ec[,c(pre, post)]
conds<-c(rep("pre", length(pre)), rep("post", length(post)))
x.ec <- aldex.clr(aldex.in.ec, conds, mc.samples=128, verbose=TRUE)
x.tt.ec <- aldex.ttest(x.ec, paired.test=TRUE)
x.effect.ec <- aldex.effect(x.ec)

#merge the data
x.all.ec <- data.frame(x.tt.ec, x.effect.ec)
#write.table(x.all.ec, file="aldex_output/prevspost_ECnumbers.txt", sep='\t', quote=F)

Next up, KOs.

ko<-read.table("data/pred_metagenome_unstrat_KO.tsv", sep="\t", quote="", check.names=F, header=T, row.names=1, comment.char="")
ko<-round(ko, digits=0)

# pre vs post all samples
aldex.in.ko<-ko[,c(pre, post)]
conds<-c(rep("pre", length(pre)), rep("post", length(post)))
x.ko <- aldex.clr(aldex.in.ko, conds, mc.samples=128, verbose=TRUE)
x.tt.ko <- aldex.ttest(x.ko, paired.test=TRUE)
x.effect.ko <- aldex.effect(x.ko)

#merge the data
x.all.ko <- data.frame(x.tt.ko, x.effect.ko)
#write.table(x.all.ko, file="aldex_output/prevspost_KOnumbers.txt", sep='\t', quote=F)

And finally, at the functional pathway level.

p<-read.table("data/path_abun_unstrat.tsv", sep="\t", quote="", check.names=F, header=T, row.names=1, comment.char="")
p<-round(p, digits=0)

# pre vs post all samples
aldex.in.p<-p[,c(pre, post)]
conds<-c(rep("pre", length(pre)), rep("post", length(post)))
x.p <- aldex.clr(aldex.in.p, conds, mc.samples=128, verbose=TRUE)
x.tt.p <- aldex.ttest(x.p, paired.test=TRUE)
x.effect.p <- aldex.effect(x.p)

#merge the data
x.all.p <- data.frame(x.tt.p, x.effect.p)
#write.table(x.all.p, file="aldex_output/prevspost_pathways.txt", sep='\t', quote=F)

Now lets make some volcano plots.

x.all.ec$effect<-x.all.ec$effect * -1
ec_num<-x.all.ec

#colours based on significance!
# add a column
ec_num$diffexpressed <- "NO"
# if log2Foldchange > 0.5 and pvalue < 0.05, set as "UP" 
ec_num$diffexpressed[ec_num$effect > 0.5] <- "UP"
# if log2Foldchange < -0.6 and pvalue < 0.05, set as "DOWN"
ec_num$diffexpressed[ec_num$effect < -0.5] <- "DOWN"

#labels based on significance!
ec_num$delabel <- NA
#ec_num$delabel[ec_num$diffexpressed != "NO"] <- rownames(ec_num)[ec_num$diffexpressed != "NO"]
ec_num$delabel[ec_num$effect > 0.65 | ec_num$effect < -0.6] <- rownames(ec_num)[ec_num$effect > 0.65 | ec_num$effect < -0.65]

ec_volcano <- ggplot(data=ec_num, aes(x=effect, y=-log10(wi.ep), col=diffexpressed)) +
  geom_point(aes(alpha = ifelse(diffexpressed == "NO", 0.5, 1))) +
  theme_bw() +
  scale_color_manual(values=c("#5a78b5", "black", "salmon")) +
  theme(legend.position="none") +
  labs(x="Effect Size", y="Wilcoxon P (-log10)") +
  geom_text_repel(aes(label = delabel), size = 3, max.overlaps = 22)
ec_volcano  
## Warning: Removed 1646 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).

3C Differential abundance of KOs

x.all.ko$effect<-x.all.ko$effect * -1
ko_num<-x.all.ko

#colours based on significance!
# add a column
ko_num$diffexpressed <- "NO"
# if log2Foldchange > 0.5 and pvalue < 0.05, set as "UP" 
ko_num$diffexpressed[ko_num$effect > 0.5] <- "UP"
# if log2Foldchange < -0.6 and pvalue < 0.05, set as "DOWN"
ko_num$diffexpressed[ko_num$effect < -0.5] <- "DOWN"

#labels based on significance!
ko_num$delabel <- NA
#ko_num$delabel[ko_num$diffexpressed != "NO"] <- rownames(ko_num)[ko_num$diffexpressed != "NO"]
ko_num$delabel[ko_num$effect > 0.65 | ko_num$effect < -0.6] <- rownames(ko_num)[ko_num$effect > 0.65 | ko_num$effect < -0.65]
## Warning in ko_num$delabel[ko_num$effect > 0.65 | ko_num$effect < -0.6] <-
## rownames(ko_num)[ko_num$effect > : number of items to replace is not a multiple
## of replacement length
ko_volcano <- ggplot(data=ko_num, aes(x=effect, y=-log10(wi.ep), col=diffexpressed)) +
  geom_point(aes(alpha = ifelse(diffexpressed == "NO", 0.5, 1))) +
  theme_bw() +
  scale_color_manual(values=c("#5a78b5", "black", "salmon")) +
  theme(legend.position="none") +
  labs(x="Effect Size", y="Wilcoxon P (-log10)") +
  geom_text_repel(aes(label = delabel), size = 3, max.overlaps = 22)
ko_volcano
## Warning: Removed 5271 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).

3D-H. Differential abundance of functional pathways

# relative abundance
p.f <- apply(p, 2, function(x) {x/sum(x)})
colSums(p.f) #check all sum to 1
##   001_post    001_pre   002_post    002_pre   003_post    004_pre   005_post 
##          1          1          1          1          1          1          1 
##    005_pre   006_post    006_pre   007_post    007_pre   008_post    008_pre 
##          1          1          1          1          1          1          1 
##   009_post    009_pre   010_post    010_pre   011_post    011_pre   012_post 
##          1          1          1          1          1          1          1 
##    012_pre DNA_BLANK2 PCR_BLANK2 
##          1          1          1
to.remove<-c("003_post","004_pre","DNA_BLANK2","PCR_BLANK2")

p.f2<-p.f[,!colnames(p.f) %in% to.remove]

#transpose to samples as rows
p.f3<-t(p.f2)
#samples must be as rows
p.czm <- cmultRepl(p.f3,  label=0, method="CZM")
## No. adjusted imputations:  222
p.clr <- t(apply(p.czm, 1, function(x){log(x) - mean(log(x))}))

meta<-read.table("data/metadata.txt", sep="\t", quote="", check.names=F, header=T, row.names=1, comment.char="")

p.clr.m<-merge(p.clr, meta, by=0, all=FALSE)
p.clr.m$participant<-as.factor(p.clr.m$participant)

p1 <- ggplot(p.clr.m, aes(x = timepoint, y = p.clr.m[[163]], group = participant, colour = participant)) +
  geom_point(aes(fill = participant), size = 3, shape = 21, stroke = 1, alpha = 0.5) +
  geom_line(aes(group = participant), size = 1) +
  scale_color_brewer(palette = "Paired") +
  scale_fill_brewer(palette = "Paired") +
  labs(title = "PWY-5913", y = "CLR relative abundance") +
  scale_x_discrete(labels = c("Pre", "Post")) +
  theme_bw() +
  annotate("text", x = 1.5, y = 1.25, label = "ES = -0.65", size = 3) +
  theme(axis.title.x = element_blank(), legend.position = "none", plot.title = element_text(size=10))

p2 <- ggplot(p.clr.m, aes(x = timepoint, y = p.clr.m[[250]], group = participant, colour = participant)) +
  geom_point(aes(fill = participant), size = 3, shape = 21, stroke = 1, alpha = 0.5) +
  geom_line(aes(group = participant), size = 1) +
  scale_color_brewer(palette = "Paired") +
  scale_fill_brewer(palette = "Paired") +
  labs(title = "PWY-7328", y = "CLR relative abundance") +
  scale_x_discrete(labels = c("Pre", "Post")) +
  annotate("text", x = 1.5, y = 0.5, label = "ES = -0.61", size = 3) +
  theme_bw() +
  theme(axis.title.x = element_blank(), legend.position = "none", plot.title = element_text(size=10))

p3 <- ggplot(p.clr.m, aes(x = timepoint, y = p.clr.m[[108]], group = participant, colour = participant)) +
  geom_point(aes(fill = participant), size = 3, shape = 21, stroke = 1, alpha = 0.5) +
  geom_line(aes(group = participant), size = 1) +
  scale_color_brewer(palette = "Paired") +
  scale_fill_brewer(palette = "Paired") +
  labs(title = "PWY-3781", y = "CLR relative abundance") +
  scale_x_discrete(labels = c("Pre", "Post")) +
  annotate("text", x = 1.5, y = 0.5, label = "ES = -0.58", size = 3) +
  theme_bw() +
  theme(axis.title.x = element_blank(), legend.position = "none", plot.title = element_text(size=10))

p4 <- ggplot(p.clr.m, aes(x = timepoint, y = p.clr.m[[80]], group = participant, colour = participant)) +
  geom_point(aes(fill = participant), size = 3, shape = 21, stroke = 1, alpha = 0.5) +
  geom_line(aes(group = participant), size = 1) +
  scale_color_brewer(palette = "Paired") +
  scale_fill_brewer(palette = "Paired") +
  labs(title = "P125-PWY", y = "CLR relative abundance") +
  scale_x_discrete(labels = c("Pre", "Post")) +
  annotate("text", x = 1.5, y = 0.5, label = "ES = -0.57", size = 3) +
  theme_bw() +
  theme(axis.title.x = element_blank(), legend.position = "none", plot.title = element_text(size=10))

p5 <- ggplot(p.clr.m, aes(x = timepoint, y = p.clr.m[[64]], group = participant, colour = participant)) +
  geom_point(aes(fill = participant), size = 3, shape = 21, stroke = 1, alpha = 0.5) +
  geom_line(aes(group = participant), size = 1) +
  scale_color_brewer(palette = "Paired") +
  scale_fill_brewer(palette = "Paired") +
  labs(title = "LACTOSECAT-PWY", y = "CLR relative abundance") +
  scale_x_discrete(labels = c("Pre", "Post")) +
  annotate("text", x = 1.5, y = 0.5, label = "ES = -0.55", size = 3) +
  theme_bw() +
  theme(axis.title.x = element_blank(), legend.position = "none", plot.title = element_text(size=10))

Make the final figure panel including Figure 3 B-H

layout <- "
AAAACDE
BBBBFG#
"

ec_volcano + ko_volcano + p1 + p2 + p3 + p4 + p5 +
  plot_layout(design = layout)

Supplementary Figures

Supplementary Figure 1. Differentially abundant taxa and functional features between metabolic responders and non-responders.

Specify the sample comparison groups:

metR_pre<-c("001_pre","005_pre","007_pre","009_pre","011_pre")
metR_post<-c("001_post","005_post","007_post","009_post","011_post")

metNR_pre<-c("002_pre","006_pre","008_pre","010_pre","012_pre")
metNR_post<-c("002_post","006_post","008_post","010_post","012_post")

Perform an aldex2 t-test and effect size test of un paired pre-intervention R vs NR samples.

First, taxa (SV).

aldex.in<-d[,c(metR_pre, metNR_pre)]
conds<-c(rep("metR_pre", length(metR_pre)), rep("metNR_pre", length(metNR_pre)))
x <- aldex.clr(aldex.in, conds, mc.samples=128, verbose=TRUE)
x.tt <- aldex.ttest(x, paired.test=TRUE)
x.effect <- aldex.effect(x)

x.all <- data.frame(x.tt, x.effect) #merge the data
tax<-as.data.frame(tax)
x.all.tax.sv.pre <- merge(x.all, dplyr::select(tax, 2:7), by = "row.names", all.y = FALSE)

#write.table(x.all.tax.sv.pre, file="aldex_output/metR_vsNR_preSV2.txt", sep="\t", quote=F, col.names=NA)

Make the volcano plot based on these comparisons.

pre.sv<-x.all.tax.sv.pre
# add a column for labels based on significance
pre.sv$diffexpressed <- "NO"
# if log2Foldchange > 0.5 and pvalue < 0.05, set as "UP" 
pre.sv$diffexpressed[pre.sv$effect > 0.5] <- "UP"
# if log2Foldchange < -0.6 and pvalue < 0.05, set as "DOWN"
pre.sv$diffexpressed[pre.sv$effect < -0.5] <- "DOWN"
pre.sv$delabel <- NA
rownames(pre.sv) <-pre.sv[,1]
pre.sv$delabel[pre.sv$diffexpressed != "NO"] <- rownames(pre.sv)[pre.sv$diffexpressed != "NO"]

pre.sv.v <- ggplot(data=pre.sv, aes(x=effect, y=-log10(wi.ep), col=diffexpressed)) +
  geom_point(aes(alpha = ifelse(diffexpressed == "NO", 0.5, 1))) +
  theme_bw() +
  scale_color_manual(values=c("#5a78b5", "black", "salmon")) +
  theme(legend.position="none") +
  labs(x="Effect Size", y="Wilcoxon P (-log10)")
aldex.in<-d[,c(metR_post, metNR_post)]
conds<-c(rep("metR_post", length(metR_post)), rep("metNR_post", length(metNR_post)))
x <- aldex.clr(aldex.in, conds, mc.samples=128, verbose=TRUE)
x.tt <- aldex.ttest(x, paired.test=TRUE)
x.effect <- aldex.effect(x)

x.all <- data.frame(x.tt, x.effect) #merge the data
tax<-as.data.frame(tax)
x.all.tax.sv.post <- merge(x.all, dplyr::select(tax, 2:7), by = "row.names", all.y = FALSE)

#write.table(x.all.tax.sv.post, file="metR_vsNR_postSV2.txt", sep="\t", quote=F, col.names=NA)
post.sv<-x.all.tax.sv.post
# add a column for labels based on significance
post.sv$diffexpostssed <- "NO"
# if log2Foldchange > 0.5 and pvalue < 0.05, set as "UP" 
post.sv$diffexpostssed[post.sv$effect > 0.5] <- "UP"
# if log2Foldchange < -0.6 and pvalue < 0.05, set as "DOWN"
post.sv$diffexpostssed[post.sv$effect < -0.5] <- "DOWN"
post.sv$delabel <- NA
rownames(post.sv) <-post.sv[,1]
post.sv$delabel[post.sv$diffexpostssed != "NO"] <- rownames(post.sv)[post.sv$diffexpostssed != "NO"]

post.sv.v <- ggplot(data=post.sv, aes(x=effect, y=-log10(wi.ep), col=diffexpostssed)) +
  geom_point(aes(alpha = ifelse(diffexpostssed == "NO", 0.5, 1))) +
  theme_bw() +
  scale_color_manual(values=c("#5a78b5", "black", "salmon")) +
  theme(legend.position="none") +
  labs(x="Effect Size", y="Wilcoxon P (-log10)")

Repeat with functional EC values.

aldex.in<-ec[,c(metR_pre, metNR_pre)]
conds<-c(rep("metR_pre", length(metR_pre)), rep("metNR_pre", length(metNR_pre)))
x <- aldex.clr(aldex.in, conds, mc.samples=128, verbose=TRUE)
x.tt <- aldex.ttest(x, paired.test=TRUE)
x.effect <- aldex.effect(x)

pre.ec <- data.frame(x.tt, x.effect) #merge the data

#write.table(pre.ec, file="metR_vsNR_preEC2.txt", sep="\t", quote=F, col.names=NA)
# add a column for labels based on significance
pre.ec$diffexpressed <- "NO"
# if log2Foldchange > 0.5 and pvalue < 0.05, set as "UP" 
pre.ec$diffexpressed[pre.ec$effect > 0.5] <- "UP"
# if log2Foldchange < -0.6 and pvalue < 0.05, set as "DOWN"
pre.ec$diffexpressed[pre.ec$effect < -0.5] <- "DOWN"
pre.ec$delabel <- NA
pre.ec$delabel[pre.ec$diffexpressed != "NO"] <- rownames(pre.ec)[pre.ec$diffexpressed != "NO"]

pre.ec.v <- ggplot(data=pre.ec, aes(x=effect, y=-log10(wi.ep), col=diffexpressed)) +
  geom_point(aes(alpha = ifelse(diffexpressed == "NO", 0.5, 1))) +
  theme_bw() +
  scale_color_manual(values=c("#5a78b5", "black", "salmon")) +
  theme(legend.position="none") +
  labs(x="Effect Size", y="Wilcoxon P (-log10)")
aldex.in<-ec[,c(metR_post, metNR_post)]
conds<-c(rep("metR_post", length(metR_post)), rep("metNR_post", length(metNR_post)))
x <- aldex.clr(aldex.in, conds, mc.samples=128, verbose=TRUE)
x.tt <- aldex.ttest(x, paired.test=TRUE)
x.effect <- aldex.effect(x)

post.ec <- data.frame(x.tt, x.effect) #merge the data

#write.table(post.ec, file="metR_vsNR_postEC2.txt", sep="\t", quote=F, col.names=NA)
# add a column for labels based on significance
post.ec$diffexpressed <- "NO"
# if log2Foldchange > 0.5 and pvalue < 0.05, set as "UP" 
post.ec$diffexpressed[post.ec$effect > 0.5] <- "UP"
# if log2Foldchange < -0.6 and pvalue < 0.05, set as "DOWN"
post.ec$diffexpressed[post.ec$effect < -0.5] <- "DOWN"
post.ec$delabel <- NA
post.ec$delabel[post.ec$diffexpressed != "NO"] <- rownames(post.ec)[post.ec$diffexpressed != "NO"]

post.ec.v <- ggplot(data=post.ec, aes(x=effect, y=-log10(wi.ep), col=diffexpressed)) +
  geom_point(aes(alpha = ifelse(diffexpressed == "NO", 0.5, 1))) +
  theme_bw() +
  scale_color_manual(values=c("#5a78b5", "black", "salmon")) +
  theme(legend.position="none") +
  labs(x="Effect Size", y="Wilcoxon P (-log10)")

Repeat with functional pathways.

aldex.in<-p[,c(metR_pre, metNR_pre)]
conds<-c(rep("metR_pre", length(metR_pre)), rep("metNR_pre", length(metNR_pre)))
x <- aldex.clr(aldex.in, conds, mc.samples=128, verbose=TRUE)
x.tt <- aldex.ttest(x, paired.test=TRUE)
x.effect <- aldex.effect(x)

pre.paths <- data.frame(x.tt, x.effect) #merge the data

#write.table(pre.paths, file="metR_vsNR_prePaths2.txt", sep="\t", quote=F, col.names=NA)
# add a column for labels based on significance
pre.paths$diffexpressed <- "NO"
# if log2Foldchange > 0.5 and pvalue < 0.05, set as "UP" 
pre.paths$diffexpressed[pre.paths$effect > 0.5] <- "UP"
# if log2Foldchange < -0.6 and pvalue < 0.05, set as "DOWN"
pre.paths$diffexpressed[pre.paths$effect < -0.5] <- "DOWN"
pre.paths$delabel <- NA
pre.paths$delabel[pre.paths$diffexpressed != "NO"] <- rownames(pre.paths)[pre.paths$diffexpressed != "NO"]

pre.paths.v <- ggplot(data=pre.paths, aes(x=effect, y=-log10(wi.ep), col=diffexpressed)) +
  geom_point(aes(alpha = ifelse(diffexpressed == "NO", 0.5, 1))) +
  theme_bw() +
  scale_color_manual(values=c("#5a78b5", "black", "salmon")) +
  theme(legend.position="none") +
  labs(x="Effect Size", y="Wilcoxon P (-log10)")
aldex.in<-p[,c(metR_post, metNR_post)]
conds<-c(rep("metR_post", length(metR_post)), rep("metNR_post", length(metNR_post)))
x <- aldex.clr(aldex.in, conds, mc.samples=128, verbose=TRUE)
x.tt <- aldex.ttest(x, paired.test=TRUE)
x.effect <- aldex.effect(x)

post.paths <- data.frame(x.tt, x.effect) #merge the data

#write.table(post.paths, file="metR_vsNR_postPaths2.txt", sep="\t", quote=F, col.names=NA)
# add a column for labels based on significance
post.paths$diffexpressed <- "NO"
# if log2Foldchange > 0.5 and pvalue < 0.05, set as "UP" 
post.paths$diffexpressed[post.paths$effect > 0.5] <- "UP"
# if log2Foldchange < -0.6 and pvalue < 0.05, set as "DOWN"
post.paths$diffexpressed[post.paths$effect < -0.5] <- "DOWN"
post.paths$delabel <- NA
post.paths$delabel[post.paths$diffexpressed != "NO"] <- rownames(post.paths)[post.paths$diffexpressed != "NO"]

post.paths.v <- ggplot(data=post.paths, aes(x=effect, y=-log10(wi.ep), col=diffexpressed)) +
  geom_point(aes(alpha = ifelse(diffexpressed == "NO", 0.5, 1))) +
  theme_bw() +
  scale_color_manual(values=c("#5a78b5", "black", "salmon")) +
  theme(legend.position="none") +
  labs(x="Effect Size", y="Wilcoxon P (-log10)")

Make the figure panel

grid.arrange(pre.sv.v, post.sv.v, pre.ec.v, post.ec.v, pre.paths.v, post.paths.v, nrow=3)

Compare the features (ASVs, ECs, pathways) across timepoints.

# Convert row names to a column called Row.names for .ec and .p files
pre.ec$Row.names <- rownames(pre.ec)
post.ec$Row.names <- rownames(post.ec)

pre.paths$Row.names <- rownames(pre.paths)
post.paths$Row.names <- rownames(post.paths)

# Function to create comparison table
generate_comparison_table <- function(pre, post) {
  # Extract negative effect features
  pre.neg <- pre$Row.names[pre$effect < -0.5]
  post.neg <- post$Row.names[post$effect < -0.5]
  
  # Find common and unique features (negative effect)
  common.neg <- intersect(pre.neg, post.neg)
  unique.pre.neg <- setdiff(pre.neg, post.neg)
  unique.post.neg <- setdiff(post.neg, pre.neg)
  
  # Extract positive effect features
  pre.pos <- pre$Row.names[pre$effect > 0.5]
  post.pos <- post$Row.names[post$effect > 0.5]
  
  # Find common and unique features (positive effect)
  common.pos <- intersect(pre.pos, post.pos)
  unique.pre.pos <- setdiff(pre.pos, post.pos)
  unique.post.pos <- setdiff(post.pos, pre.pos)
  
  # Determine the maximum number of rows needed
  max_length <- max(length(common.neg), length(unique.pre.neg), length(unique.post.neg), 
                    length(common.pos), length(unique.pre.pos), length(unique.post.pos))
  
  # Pad each vector with NA to match max_length
  data.frame(
    Common_Negative = c(common.neg, rep(NA, max_length - length(common.neg))),
    Unique_Pre_Neg = c(unique.pre.neg, rep(NA, max_length - length(unique.pre.neg))),
    Unique_Post_Neg = c(unique.post.neg, rep(NA, max_length - length(unique.post.neg))),
    Common_Positive = c(common.pos, rep(NA, max_length - length(common.pos))),
    Unique_Pre_Pos = c(unique.pre.pos, rep(NA, max_length - length(unique.pre.pos))),
    Unique_Post_Pos = c(unique.post.pos, rep(NA, max_length - length(unique.post.pos)))
  )
}

# Generate comparison tables
comparison_table_sv <- generate_comparison_table(pre.sv, post.sv)
comparison_table_ec <- generate_comparison_table(pre.ec, post.ec)
comparison_table_paths <- generate_comparison_table(pre.paths, post.paths)

# Save results
#write.table(comparison_table_sv, "RvsNRcomparisons_sv.txt", sep="\t", row.names=FALSE, quote=FALSE)
#write.table(comparison_table_ec, "RvsNRcomparisons_ec.txt", sep="\t", row.names=FALSE, quote=FALSE)
#write.table(comparison_table_paths, "RvsNRcomparisons_paths.txt", sep="\t", row.names=FALSE, quote=FALSE)

Additional Exploratory Analyses

Delta CLR - correlation with baseline metadata features

Calculate the deltaCLR (CLR post - CLR pre)

d.clr <- as.data.frame(d.clr)
d.clr$id <- rownames(d.clr)

# Split into pre and post tables
d.pre <- d.clr[grepl("_pre$", d.clr$id), ]
d.post <- d.clr[grepl("_post$", d.clr$id), ]

# Extract individual IDs
d.pre$participant <- sub("_pre$", "", d.pre$id)
d.post$participant <- sub("_post$", "", d.post$id)

# Make sure features are only the columns with CLR values
clr.cols <- setdiff(colnames(d.clr), "id")

# Merge pre and post by subject
d.delta <- merge(d.post[, c("participant", clr.cols)],
                 d.pre[, c("participant", clr.cols)],
                 by = "participant",
                 suffixes = c("_post", "_pre"))

# Calculate delta CLR (post - pre)
delta.cols <- paste0(clr.cols, "_delta")
d.delta[delta.cols] <- d.delta[paste0(clr.cols, "_post")] - d.delta[paste0(clr.cols, "_pre")]

# Keep only subject and delta CLR columns
d.delta.clr <- d.delta[, c("participant", delta.cols)]

# Remove leading zeros
d.delta.clr$participant <- sub("^0+", "", d.delta.clr$participant)

# merge with metadata
delta_meta <- merge(meta, d.delta.clr, by = "participant", all = FALSE)

deltas <- c(39:542)
delta_cols <- names(delta_meta)[deltas]

delta_meta <- delta_meta[delta_meta$timepoint!= "2_post",]

#perform wilcoxon test of delta CLR values between R vs NR
#Ensure grouping variable is a factor
delta_meta$metabolic_responder <- as.factor(delta_meta$metabolic_responder)

wilcox_results <- lapply(delta_cols, function(col) {
  
  # Skip non-numeric or zero-variance features
  if (!is.numeric(delta_meta[[col]]) ||
      length(unique(na.omit(delta_meta[[col]]))) < 2) {
    return(NULL)
  }
  
  # Wilcoxon test
  wt <- wilcox.test(
    delta_meta[[col]] ~ delta_meta$metabolic_responder,
    exact = FALSE,
    na.action = na.omit
  )
  
  # Group means
  mean_NR <- mean(delta_meta[[col]][delta_meta$metabolic_responder == "NR"], na.rm = TRUE)
  mean_R  <- mean(delta_meta[[col]][delta_meta$metabolic_responder == "R"],  na.rm = TRUE)
  
  data.frame(
    feature = col,
    mean_NR = mean_NR,
    mean_R  = mean_R,
    W = wt$statistic,
    p_value = wt$p.value
  )
})
wilcox_results <- do.call(rbind, wilcox_results)

# Multiple testing correction
wilcox_results$padj <- p.adjust(wilcox_results$p_value, method = "fdr")

# Order by significance
wilcox_results <- wilcox_results[order(wilcox_results$p_value), ]

meta2 <- meta
meta2$time_responder <- paste(
  meta2$timepoint,
  meta2$metabolic_responder,
  sep = "_"
)

#run maaslin2 based on metabolic responder status
# #fit_data <- Maaslin2(d.czm, meta2, 'maaslin/SV_metresponders_timepoint_interaction2', 
#                      transform = "none", 
#                      normalization = "CLR", 
#                      analysis_method = "LM", 
#                      fixed_effects = c("metabolic_responder","timepoint","time_responder"),
#                      reference = c("time_responder,1_pre_NR"),
#                      random_effects = c("participant"),
#                      standardize = FALSE)

# # samples significant for participant, BP_systolic, tot_cholesterol, LDL
# meta$participant<-as.factor(meta$participant)
# 
# #run some correlations with the delta clr
# #QIDS
# # QIDS_percent_res <- data.frame()
# # for (col in delta_cols) {
# #   res <- cor.test(delta_meta$QIDS_percent_change, delta_meta[[col]], method = "spearman")
# #   res.df <- tidy(res)
# #   res.df$SV <- col
# #   QIDS_percent_res <- rbind(QIDS_percent_res, res.df)
# # }
# # QIDS_percent_res$BH <- p.adjust(QIDS_percent_res$p.value, method = "BH")
# # SV 23 aka strep salivarius negatively associated with qids percent change aka higher strep with improving depression symptomology!!
# 
# # test QIDS correlation to metabolic changes
# post1 <- c(27:36)
# post2 <- names(delta_meta_post)[post1]
# 
# # QIDS_metabolic <- data.frame()
# # for (metric in post2){
# #   res <- cor.test(delta_meta$QIDS_percent_change, delta_meta[[metric]], method = "spearman")
# #   res.df <- tidy(res)
# #   res.df$metric <- metric
# #   QIDS_metabolic <- rbind(QIDS_metabolic, res.df)
# #  }
# # QIDS_metabolic$BH <- p.adjust(QIDS_metabolic$p.value, method = "BH") #no significant correlations between QIDS and any of the 'post' metabolic changes
# 
# # OASIS_percent_res <- data.frame()
# # for (col in delta_cols) {
# #   res <- cor.test(delta_meta$OASIS_percent_change, delta_meta[[col]], method = "spearman")
# #   res.df <- tidy(res)
# #   res.df$SV <- col
# #   OASIS_percent_res <- rbind(OASIS_percent_res, res.df)
# # }
# # OASIS_percent_res$BH <- p.adjust(OASIS_percent_res$p.value, method = "BH")
# # #nothing significant
# 
# # OASIS_metabolic <- data.frame()
# # for (metric in post2){
# #   res <- cor.test(delta_meta$OASIS_percent_change, delta_meta[[metric]], method = "spearman")
# #   res.df <- tidy(res)
# #   res.df$metric <- metric
# #   OASIS_metabolic <- rbind(OASIS_metabolic, res.df)
# #  }
# # OASIS_metabolic$BH <- p.adjust(OASIS_metabolic$p.value, method = "BH") #no significant correlations between OASIS and any of the 'post' metabolic changes
# 
# delta_meta_pre <- subset(delta_meta, timepoint == "1_pre")
# delta_meta_post <- subset(delta_meta, timepoint == "2_post")
# 
# #baseline correlations
# bl1 <- c(9, 18:26)
# bl2 <- names(delta_meta_pre)[bl1]
# 
# BL_cor <- data.frame()
# for (metric in bl2) {
#   for (col in delta_cols) {
#     res <- cor.test(delta_meta_pre[[metric]], delta_meta_pre[[col]], method = "spearman")
#     res.df <- tidy(res)
#     res.df$BaselineMetric <- metric
#     res.df$SV <- col
#     BL_cor <- rbind(BL_cor, res.df)
#   }
# }
# 
# BL_cor <- BL_cor %>%
#   arrange(BaselineMetric, SV) %>%  # <-- stable order
#   group_by(BaselineMetric) %>%
#   mutate(BH = p.adjust(p.value, method = "BH")) %>%
#   ungroup()
# #nothing significant. So baseline vales dont correlate with change in CLR of any SV
# 
# #post correlations
# post1 <- c(27:36)
# post2 <- names(delta_meta_post)[post1]
# 
# post_cor <- data.frame()
# for (metric in post2) {
#   for (col in delta_cols) {
#     res <- cor.test(delta_meta_post[[metric]], delta_meta_post[[col]], method = "spearman")
#     res.df <- tidy(res)
#     res.df$PostMetric <- metric
#     res.df$SV <- col
#     post_cor <- rbind(post_cor, res.df)
#   }
# }
# 
# post_cor <- post_cor %>%
#   arrange(PostMetric, SV) %>%  # <-- stable order
#   group_by(PostMetric) %>%
#   mutate(BH = p.adjust(p.value, method = "BH")) %>%
#   ungroup()
# #nothing significant

# correlate QIDS and OASIS with metabolic changes